% BERT preprint. The acronym is braced so sentence-casing styles keep it upper-case.
@article{devlin2018bert,
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title   = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}

% Megatron-LM preprint. The model name is braced so its capitalisation survives style recasing.
@article{shoeybi2019megatron,
  author  = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title   = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  journal = {arXiv preprint arXiv:1909.08053},
  year    = {2019},
}

% ACL-HLT 2011 paper on word vectors for sentiment analysis.
% The month uses the predefined bare macro so styles can localise or abbreviate it.
@inproceedings{maas2011,
  author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = jun,
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015},
}

% EMNLP 2013 paper on recursive deep models over a sentiment treebank.
@inproceedings{socher2013,
  author    = {Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D. and Ng, Andrew and Potts, Christopher},
  title     = {Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},
  booktitle = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, Washington, USA},
  month     = oct,
  year      = {2013},
  pages     = {1631--1642},
  url       = {https://www.aclweb.org/anthology/D13-1170},
}

% Journal article in Database, volume 2018, on chemical-gene relation extraction.
@article{lim2018chemical,
  author    = {Lim, Sangrak and Kang, Jaewoo},
  title     = {Chemical--gene relation extraction using recursive neural network},
  journal   = {Database},
  volume    = {2018},
  publisher = {Oxford Academic},
  year      = {2018},
}

% EMNLP-CoNLL 2007 paper on term selection for text categorization.
@inproceedings{li2007scalable,
  author    = {Li, Jingyang and Sun, Maosong},
  title     = {Scalable term selection for text categorization},
  booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)},
  year      = {2007},
  pages     = {774--782},
}

% BioBERT arXiv eprint. The model name is braced so sentence-casing styles do not lowercase it.
@misc{lee2019biobert,
  author        = {Lee, Jinhyuk and Yoon, Wonjin and Kim, Sungdong and Kim, Donghyeon and Kim, Sunkyu and So, Chan Ho and Kang, Jaewoo},
  title         = {{BioBERT}: a pre-trained biomedical language representation model for biomedical text mining},
  year          = {2019},
  eprint        = {1901.08746},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% BioMegatron arXiv eprint. The camel-case model name is braced to survive style recasing.
@misc{shin2020biomegatron,
  author        = {Shin, Hoo-Chang and Zhang, Yang and Bakhturina, Evelina and Puri, Raul and Patwary, Mostofa and Shoeybi, Mohammad and Mani, Raghav},
  title         = {{BioMegatron}: Larger Biomedical Domain Language Model},
  year          = {2020},
  eprint        = {2010.06060},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Transformer paper, proceedings version (Advances in Neural Information Processing Systems, 2017).
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2017},
  pages     = {6000--6010},
}

% Subword-unit NMT preprint (arXiv:1508.07909).
@article{sennrich2015neural,
  author  = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title   = {Neural machine translation of rare words with subword units},
  journal = {arXiv preprint arXiv:1508.07909},
  year    = {2015},
}

% BPE-Dropout preprint. The method name is braced so the acronym keeps its capitals.
@article{provilkov2019bpe,
  author  = {Provilkov, Ivan and Emelianenko, Dmitrii and Voita, Elena},
  title   = {{BPE-Dropout}: Simple and Effective Subword Regularization},
  journal = {arXiv preprint arXiv:1910.13267},
  year    = {2019},
}

% Preprint on BLEU reporting. The acronym is braced so sentence-casing styles keep it upper-case.
@article{post2018call,
  author  = {Post, Matt},
  title   = {A Call for Clarity in Reporting {BLEU} Scores},
  journal = {arXiv preprint arXiv:1804.08771},
  year    = {2018},
}

% SGD-QA arXiv eprint. The acronym in the title is braced to survive style recasing.
@misc{zhang2021sgdqa,
  author        = {Zhang, Yang and Noroozi, Vahid and Bakhturina, Evelina and Ginsburg, Boris},
  title         = {{SGD-QA}: Fast Schema-Guided Dialogue State Tracking for Unseen Services},
  year          = {2021},
  eprint        = {2105.08049},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Computational Linguistics journal article on neural text normalization.
% Page range uses the double hyphen; given names are stored in full so the style decides abbreviation.
@article{zhang2019neural,
  author  = {Zhang, Hao and Sproat, Richard and Ng, Axel H. and Stahlberg, Felix and Peng, Xiaochang and Gorman, Kyle and Roark, Brian},
  title   = {Neural Models of Text Normalization for Speech Applications},
  journal = {Computational Linguistics},
  volume  = {45},
  number  = {2},
  pages   = {293--338},
  year    = {2019},
}

% Self-alignment pretraining arXiv eprint (2010.11784, cs.CL).
@misc{liu2021selfalignment,
  author        = {Fangyu Liu and Ehsan Shareghi and Zaiqiao Meng and Marco Basaldella and Nigel Collier},
  title         = {Self-Alignment Pretraining for Biomedical Entity Representations},
  year          = {2021},
  archivePrefix = {arXiv},
  eprint        = {2010.11784},
  primaryClass  = {cs.CL},
}

% Preprint on monolingual corpora in NMT (arXiv:1503.03535).
@article{gulcehre2015using,
  author  = {Gulcehre, Caglar and Firat, Orhan and Xu, Kelvin and Cho, Kyunghyun and Barrault, Loic and Lin, Huei-Chi and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  title   = {On using monolingual corpora in neural machine translation},
  journal = {arXiv preprint arXiv:1503.03535},
  year    = {2015},
}

% Noisy channel modeling preprint (arXiv:1908.05731).
@article{yee2019simple,
  author  = {Yee, Kyra and Ng, Nathan and Dauphin, Yann N and Auli, Michael},
  title   = {Simple and effective noisy channel modeling for neural machine translation},
  journal = {arXiv preprint arXiv:1908.05731},
  year    = {2019},
}

% Moses toolkit paper from the ACL 2007 demo and poster sessions.
@inproceedings{koehnetal2007moses,
  author    = {Koehn, Philipp
               and Hoang, Hieu
               and Birch, Alexandra
               and Callison-Burch, Chris
               and Federico, Marcello
               and Bertoldi, Nicola
               and Cowan, Brooke
               and Shen, Wade
               and Moran, Christine
               and Zens, Richard
               and Dyer, Chris
               and Bojar, Ond{\v{r}}ej
               and Constantin, Alexandra
               and Herbst, Evan},
  title     = {{M}oses: Open Source Toolkit for Statistical Machine Translation},
  booktitle = {Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions},
  publisher = {Association for Computational Linguistics},
  address   = {Prague, Czech Republic},
  month     = jun,
  year      = {2007},
  pages     = {177--180},
  url       = {https://aclanthology.org/P07-2045},
}

% Interspeech 2020 paper on punctuation prediction.
% The title is single-braced so the bibliography style, not the database, controls its casing.
@inproceedings{sunkara20_interspeech,
  author    = {Sunkara, Monica and Ronanki, Srikanth and Bekal, Dhanush and Bodapati, Sravan and Kirchhoff, Katrin},
  title     = {Multimodal Semi-Supervised Learning Framework for Punctuation Prediction in Conversational Speech},
  booktitle = {Proc. Interspeech 2020},
  year      = {2020},
  pages     = {4911--4915},
  doi       = {10.21437/Interspeech.2020-3074},
}

% Preprint on BERT for joint intent classification and slot filling.
% The acronym is braced so sentence-casing styles keep it upper-case.
@article{chen2019bert,
  author  = {Chen, Qian and Zhuo, Zhu and Wang, Wen},
  title   = {{BERT} for Joint Intent Classification and Slot Filling},
  journal = {arXiv preprint arXiv:1902.10909},
  year    = {2019},
}

% Retrieval-augmented language model preprint (arXiv:2112.04426).
% The particle "van den" is placed before the surname so it is parsed as the von-part, not as a given name.
@article{borgeaud2021improving,
  author  = {Borgeaud, Sebastian and Mensch, Arthur and Hoffmann, Jordan and Cai, Trevor and Rutherford, Eliza and Millican, Katie and van den Driessche, George and Lespiau, Jean-Baptiste and Damoc, Bogdan and Clark, Aidan and others},
  title   = {Improving language models by retrieving from trillions of tokens},
  journal = {arXiv preprint arXiv:2112.04426},
  year    = {2021},
}

% RoFormer preprint (arXiv:2104.09864). The camel-case model name is braced to survive style recasing.
@article{su2021roformer,
  author  = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Wen, Bo and Liu, Yunfeng},
  title   = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
  journal = {arXiv preprint arXiv:2104.09864},
  year    = {2021},
}

% Sentence-BERT preprint (arXiv:1908.10084).
% Model names and the proper adjective are braced so sentence-casing styles keep their capitals.
@article{reimers2019sentence,
  author  = {Reimers, Nils and Gurevych, Iryna},
  title   = {{Sentence-BERT}: Sentence Embeddings using {Siamese} {BERT-Networks}},
  journal = {arXiv preprint arXiv:1908.10084},
  year    = {2019},
}

% Tensor Programs V preprint (arXiv:2203.03466).
% The roman numeral is braced so sentence-casing styles do not turn it into a lowercase letter.
@article{yang2022tensor,
  author  = {Yang, Greg and Hu, Edward J and Babuschkin, Igor and Sidor, Szymon and Liu, Xiaodong and Farhi, David and Ryder, Nick and Pachocki, Jakub and Chen, Weizhu and Gao, Jianfeng},
  title   = {Tensor Programs {V}: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer},
  journal = {arXiv preprint arXiv:2203.03466},
  year    = {2022},
}

% Faiss software record in the Astrophysics Source Code Library.
% ASCL records have no page range; the record identifier is kept in the note field instead.
% NOTE(review): confirm the record identifier against the ASCL listing.
@article{jegou2022faiss,
  author  = {J{\'e}gou, Herv{\'e} and Douze, Matthijs and Johnson, Jeff and Hosseini, Lucas and Deng, Chengqi},
  title   = {Faiss: Similarity search and clustering of dense vectors library},
  journal = {Astrophysics Source Code Library},
  year    = {2022},
  note    = {ascl:2210.024},
}

% SpellMapper arXiv eprint. The model name and the acronym are braced to survive style recasing.
@misc{antonova2023spellmapper,
  author        = {Antonova, Alexandra and Bakhturina, Evelina and Ginsburg, Boris},
  title         = {{SpellMapper}: A non-autoregressive neural spellchecker for {ASR} customization with candidate retrieval based on n-gram mappings},
  year          = {2023},
  eprint        = {2306.02317},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% FlashAttention arXiv eprint.
% The accented surname uses a LaTeX escape, matching the other accented names in this file,
% and the camel-case and acronym words in the title are braced.
@misc{dao2022flashattention,
  author        = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  title         = {{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO-Awareness}},
  year          = {2022},
  eprint        = {2205.14135},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% Transformer paper, arXiv eprint 1706.03762.
% Names use the unambiguous comma form; the L-stroke in the given name of Kaiser is written as a LaTeX escape.
@misc{vaswani2023attention,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  year          = {2023},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% RoFormer arXiv eprint 2104.09864. The camel-case model name is braced to survive style recasing.
@misc{su2022roformer,
  author        = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
  title         = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
  year          = {2022},
  eprint        = {2104.09864},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Attention with linear biases, arXiv eprint 2108.12409 (cs.CL).
@misc{press2022train,
  author        = {Ofir Press and Noah A. Smith and Mike Lewis},
  title         = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  year          = {2022},
  archivePrefix = {arXiv},
  eprint        = {2108.12409},
  primaryClass  = {cs.CL},
}

% KERPLE arXiv eprint. The acronym is braced so sentence-casing styles keep it upper-case.
@misc{chi2022kerple,
  author        = {Chi, Ta-Chung and Fan, Ting-Han and Ramadge, Peter J. and Rudnicky, Alexander I.},
  title         = {{KERPLE}: Kernelized Relative Positional Embedding for Length Extrapolation},
  year          = {2022},
  eprint        = {2205.09921},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Length-extrapolatable Transformer, arXiv eprint 2212.10554 (cs.CL).
@misc{sun2022lengthextrapolatable,
  author        = {Yutao Sun and Li Dong and Barun Patra and Shuming Ma and Shaohan Huang and Alon Benhaim and Vishrav Chaudhary and Xia Song and Furu Wei},
  title         = {A Length-Extrapolatable Transformer},
  year          = {2022},
  archivePrefix = {arXiv},
  eprint        = {2212.10554},
  primaryClass  = {cs.CL},
}

% Receptive-field analysis of length extrapolation, arXiv eprint 2212.10356 (cs.CL).
@misc{chi2023dissecting,
  author        = {Ta-Chung Chi and Ting-Han Fan and Alexander I. Rudnicky and Peter J. Ramadge},
  title         = {Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis},
  year          = {2023},
  archivePrefix = {arXiv},
  eprint        = {2212.10356},
  primaryClass  = {cs.CL},
}

% Relative position representations for self-attention, arXiv eprint 1803.02155 (cs.CL).
@misc{shaw2018selfattention,
  author        = {Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
  title         = {Self-Attention with Relative Position Representations},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1803.02155},
  primaryClass  = {cs.CL},
}

% Positional interpolation for context-window extension, arXiv eprint 2306.15595 (cs.CL).
@misc{chen2023extending,
  author        = {Shouyuan Chen and Sherman Wong and Liangjian Chen and Yuandong Tian},
  title         = {Extending Context Window of Large Language Models via Positional Interpolation},
  year          = {2023},
  archivePrefix = {arXiv},
  eprint        = {2306.15595},
  primaryClass  = {cs.CL},
}